# install.packages(c("tidyverse","readr","knitr","plotly","janitor","lubridate","rmarkdown","broom","ggpubr","kableExtra","DT"))

library(tidyverse)
library(readr)
library(knitr)
library(plotly)
library(janitor)
library(lubridate)
library(rmarkdown)
library(broom)
library(ggpubr)
library(kableExtra)
library(DT)
library(grid)  # for unit()

# helpers (optional)
pretty_r2_table <- function(df, caption_text){
  df %>%
    kable("html", caption = caption_text) %>%
    kable_styling(full_width = FALSE) %>%
    row_spec(0, bold = TRUE)
}

pretty_cor_table <- function(x, y, var1_name = NULL, var2_name = NULL) {
  cor_res <- cor.test(x, y)
  tibble(
    Variable1 = var1_name,
    Variable2 = var2_name,
    Correlation = round(cor_res$estimate, 3),
    p.value = signif(cor_res$p.value, 3)
  ) %>%
    kable("html", caption = paste0("Correlation: ", var1_name, " vs ", var2_name)) %>%
    kable_styling(full_width = FALSE) %>%
    row_spec(0, bold = TRUE)
}

set.seed(123)
opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE)
#set up data for use
#read data, skip empty 2024 column
data <- read_csv("EdStats_v01.csv", show_col_types=FALSE, col_types=cols(`2024`=col_skip()))

# filter for only USA data
data_clean <- data %>%
  filter(grepl("USA", `Country code`))

#make sure no NA cols remain
if (length(colnames(data_clean)[colSums(is.na(data_clean)) == nrow(data_clean)]) == 0) {
  message("No NA cols remain.")
} else {
  message("NA columns remain.")
}
## No NA cols remain.
#save filtered dataset
write_csv(data_clean, "EdStats_USA.csv")
data_attend <- read_csv("EdStats_attend.csv", show_col_types = FALSE)

# Long -> add Type/Level -> filter -> make factors
data_long_attend <- data_attend %>%
  pivot_longer(cols = `1975`:`2022`, names_to="Year", values_to="Value") %>%
  mutate(
    Year = as.numeric(Year),
    Type = case_when(
      str_detect(`Indicator name`, regex("attendance", ignore_case=TRUE)) ~ "Attendance",
      str_detect(`Indicator name`, regex("enrolment|enrollment", ignore_case=TRUE)) ~ "Enrollment",
      TRUE ~ "Other"
    ),
    Level = case_when(
      str_detect(`Indicator name`, regex("primary", ignore_case=TRUE)) ~ "Elementary School",
      str_detect(`Indicator name`, regex("lower secondary", ignore_case=TRUE)) ~ "Middle School",
      str_detect(`Indicator name`, regex("upper secondary", ignore_case=TRUE)) ~ "High School",
      TRUE ~ "Other"
    )
  ) %>%
  filter(Type %in% c("Attendance","Enrollment"),
         Level %in% c("Elementary School","Middle School","High School"),
         !is.na(Value))

data_long_attend <- data_long_attend %>%
  mutate(
    Level = factor(Level, levels = c("Elementary School","Middle School","High School")),
    Type  = factor(Type,  levels = c("Attendance","Enrollment"))
  )

# Long -> WIDE so we have both variables in columns
data_wide <- data_long_attend %>%
  select(Year, Level, Type, Value) %>%
  pivot_wider(names_from = Type, values_from = Value) %>%
  arrange(Level, Year) %>%
  filter(!is.na(Attendance), !is.na(Enrollment))

# quick check
DT::datatable(head(data_wide), caption = "Preview of wide data (by Level × Year)")
# 1) Simple Overall Regression: Does Enrollment predict Attendance across all data?
m_overall <- lm(Attendance ~ Enrollment, data = data_wide)
DT::datatable(broom::tidy(m_overall),
              caption = "Overall Model Coefficients (Attendance ~ Enrollment)")
# 2) Regressions by School Level (nest + glance)
glance_by_level <- data_wide %>%
  tidyr::nest(data = -Level) %>%
  mutate(model = purrr::map(data, ~ lm(Attendance ~ Enrollment, data = .x)),
         stats = purrr::map(model, broom::glance)) %>%
  tidyr::unnest(stats)

DT::datatable(glance_by_level %>% dplyr::select(Level, r.squared, adj.r.squared, p.value, nobs),
              caption = "Model Quality Summary by School Level")
# 3) Multiple Regression with controls (Year + Level)
m_controls <- lm(Attendance ~ Enrollment + Year + Level, data = data_wide)
DT::datatable(broom::tidy(m_controls),
              caption = "Multiple Regression with Year and Level Controls")
# First, create the text labels from our model summary table
panel_labels <- glance_by_level %>%
  transmute(
    Level,
    label = paste0("R² = ", round(r.squared, 3), "\np = ", signif(p.value, 3))
  )

# Determine the best position for the labels on each plot
label_pos <- data_wide %>%
  group_by(Level) %>%
  summarise(x = min(Enrollment, na.rm=T), y = max(Attendance, na.rm=T)) %>%
  left_join(panel_labels, by = "Level")

p_reg <- ggplot(data_wide, aes(x = Enrollment, y = Attendance)) +
  geom_point(aes(color = Level), alpha = 0.8, show.legend = FALSE) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
  facet_wrap(~Level, scales = "free") +
  geom_label(data = label_pos, aes(x = x, y = y, label = label),
             hjust = 0, vjust = 1, size = 3.5, label.padding = unit(0.3, "lines")) +
  labs(
    title = "Attendance vs. Enrollment with OLS Fits by School Level (USA)",
    subtitle = "Labels show R-squared and model p-value for each level-specific regression",
    x = "Enrollment Rate (%)", y = "Attendance Rate (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(strip.text = element_text(face = "bold"))

print(p_reg)

# Diagnostic Plots for the Main Overall Model

par(mfrow = c(2, 2))
plot(m_overall)

par(mfrow = c(1, 1)) 
# Interactive Plotly Version for Exploring Data Points 
p_interactive <- ggplot(data_wide, aes(x = Enrollment, y = Attendance)) +
  geom_point(aes(color = Level, text = paste("Year:", Year)), alpha = 0.8) +
  facet_wrap(~Level, scales = "free") +
  labs(
    title = "Interactive Plot of Attendance vs. Enrollment",
    x = "Enrollment Rate (%)", y = "Attendance Rate (%)"
  ) +
  theme_minimal(base_size = 12)

# Convert the ggplot object to a Plotly object
ggplotly(p_interactive, tooltip = "text")